Introduction

For this analysis we use the "Climate Change: Earth Surface Temperature Data" dataset from Kaggle. The dataset can be found here: https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data#GlobalLandTemperaturesByCountry.csv

The goal is to show how the fxtract package can support climate and economic analysis.

Load required packages

library(fxtract)
library(lubridate)
library(tidyverse)
library(stringr)

Read in data

# Load the country-level temperature table; the relative path means the CSV
# must be in the current working directory.
df_glob_temp <- read.csv(file = "GlobalLandTemperaturesByCountry.csv")
# Print a compact overview of the columns and their types.
str(df_glob_temp)
## 'data.frame':    577462 obs. of  4 variables:
##  $ dt                           : Factor w/ 3239 levels "1743-11-01","1743-12-01",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ AverageTemperature           : num  4.38 NA NA NA NA ...
##  $ AverageTemperatureUncertainty: num  2.29 NA NA NA NA ...
##  $ Country                      : Factor w/ 243 levels "Ã…land","Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...

Data preprocessing

For our analysis we consider only the years since 1900. The main reason for this is that in earlier years there are many missing values for some countries.

# Turn the date column into Date objects so it can be compared and decomposed.
df_glob_temp[["dt"]] <- as.Date(df_glob_temp[["dt"]])
# Keep only observations dated 1 January 1900 or later.
df1900_raw <- df_glob_temp %>%
  filter(dt >= as.Date("1900-01-01"))

Removal of duplicates and some smaller regions.

# Regions to drop: continents, small territories, and the unsuffixed entries
# of countries that also appear with a "(Europe)" suffix (presumably the
# unsuffixed rows are the duplicates mentioned above -- confirm against the
# data set).
# Accented names are listed twice: correctly encoded, and in the mis-decoded
# form that results when the UTF-8 CSV is read as Latin-1/CP1252 (the str()
# output above shows "Ã…land", so the file was read that way on the authoring
# machine). Names that do not occur in the data are simply ignored by %in%.
excluded_countries <- c(
  "Denmark", "Antarctica", "France", "Europe", "Netherlands",
  "United Kingdom", "South America", "Africa", "Asia", "Baker Island",
  "Kingman Reef", "North America", "Oceania", "Palmyra Atoll",
  "Saint Martin", "Virgin Islands",
  "Åland", "Ã…land",
  "Curaçao", "CuraÃ§ao",
  "Saint Barthélemy", "Saint BarthÃ©lemy"
)
df1900 <- df1900_raw %>%
  filter(!Country %in% excluded_countries)
# Strip the "(Europe)" suffix literally (fixed = TRUE avoids regex escaping)
# and trim the blank that preceded it, so "Denmark (Europe)" becomes
# "Denmark" rather than "Denmark ".
df1900$Country <- trimws(gsub("(Europe)", "", df1900$Country, fixed = TRUE))

For an annual view we would like to have the year in a separate column.

# Extract the calendar year of every observation into its own column.
df1900[["year"]] <- year(df1900[["dt"]])

fxtract

Create our user-defined functions

# Yearly temperature statistics for one group of observations.
#
# Args:
#   data: data frame with a `year` column and a numeric `AverageTemperature`
#         column.
#
# Returns:
#   A named numeric vector holding, for every year present in `data` (in
#   ascending order), the mean, standard deviation, minimum and maximum of
#   `AverageTemperature` with missing values removed. Elements are named
#   "mean_<year>", "sd_<year>", "min_<year>" and "max_<year>", grouped by
#   statistic. A year with no non-missing values yields NaN (mean), NA (sd)
#   and +/-Inf (min/max), exactly as the underlying base functions do.
temp_stats <- function(data) {
  temps_by_year <- split(data$AverageTemperature, data$year)
  # paste0() would turn an empty year vector into a single "mean_" label,
  # so return early when there is nothing to summarise.
  if (length(temps_by_year) == 0L) {
    return(numeric(0))
  }
  # Label the statistics with the years actually present in `data`, not with
  # those of a global table: a group covering fewer years would otherwise get
  # more names than values.
  years <- names(temps_by_year)

  # Apply one summary function to every year, dropping missing values.
  per_year <- function(stat) {
    vapply(temps_by_year, stat, numeric(1), na.rm = TRUE, USE.NAMES = FALSE)
  }

  res <- c(per_year(mean), per_year(sd), per_year(min), per_year(max))
  names(res) <- c(
    paste0("mean_", years),
    paste0("sd_", years),
    paste0("min_", years),
    paste0("max_", years)
  )
  res
}

Setup and calculation

# Create an fxtract Xtractor object named "xtractor".
xtractor = Xtractor$new("xtractor")
# NOTE(review): presumably the number of parallel workers -- confirm in the
# fxtract documentation.
xtractor$n_cores = 2
# Register the preprocessed data, grouped by the Country column.
xtractor$add_data(df1900, group_by = "Country")
# Register temp_stats as the feature function; presumably it is applied once
# per Country group -- confirm in the fxtract documentation.
xtractor$add_feature(temp_stats)
# Run the feature calculation; the results are read from xtractor$results
# further below.
xtractor$calc_features()

Results

library(knitr)
library(kableExtra)
# Reshape the wide feature table (one column per statistic/year combination,
# named "<statistic>_<year>") into long form: one row per country, year and
# statistic.
res <- xtractor$results %>%
  gather(key = "key", value = "value", -Country) %>%
  separate(key, c("key", "year")) %>%
  select(Country, year, key, value) %>%
  as.data.frame()
# min()/max() with na.rm = TRUE return +/-Inf for a year without any
# observation; treat those as missing.
res$value[is.infinite(res$value)] <- NA

# One row per country and year with the four statistics as columns.
# select() replaces the deprecated select_().
res_new <- res %>%
  spread("key", "value") %>%
  select(Country, year, mean, sd, min, max)

# Show the first 20 rows as a scrollable HTML table.
res_new %>%
  slice(1:20) %>%
  kable(col.names = c("Country", "Year", "Mean", "SD", "Min", "Max")) %>%
  kable_styling() %>%
  scroll_box(width = "100%", height = "400px")
Country Year Mean SD Min Max
Afghanistan 1900 13.74933 10.098000 -3.428 27.333
Afghanistan 1901 13.89400 9.120195 0.528 26.448
Afghanistan 1902 14.50583 9.050495 2.293 26.337
Afghanistan 1903 12.98642 10.010542 -1.387 25.909
Afghanistan 1904 13.80575 9.530021 -3.088 26.434
Afghanistan 1905 13.38567 10.428008 -1.240 26.836
Afghanistan 1906 13.88492 9.726306 -0.163 26.730
Afghanistan 1907 13.35292 9.221595 1.058 26.430
Afghanistan 1908 13.69917 9.358339 1.326 27.152
Afghanistan 1909 13.92542 9.392898 -1.224 26.016
Afghanistan 1910 13.21650 9.851335 -0.979 26.149
Afghanistan 1911 13.38500 9.991634 -1.126 25.984
Afghanistan 1912 13.90267 9.301097 0.462 27.644
Afghanistan 1913 13.85925 9.525783 1.450 27.089
Afghanistan 1914 14.29683 9.523955 0.967 26.627
Afghanistan 1915 14.91458 9.556879 2.135 26.475
Afghanistan 1916 13.34300 9.548225 0.740 26.292
Afghanistan 1917 13.91650 9.340447 0.868 26.380
Afghanistan 1918 13.48325 10.032897 -0.010 26.923
Afghanistan 1919 13.88092 10.006089 -1.552 27.191

Visualization

Linear Regression with fxtract

User defined functions

# Slope of a simple linear regression of `temp` on `year`.
#
# Args:
#   data: data frame with the columns `temp` (response) and `year`
#         (predictor).
#
# Returns:
#   A length-one numeric vector named "slope" holding the fitted coefficient
#   of `year`.
limo_slope <- function(data) {
  fit <- lm(temp ~ year, data = data)
  # The coefficient vector is (intercept, year); pick the year term by name.
  year_coef <- coef(fit)[["year"]]
  c(slope = year_coef)
}

Setup and calculation

# Keep only the yearly means, expose them as `temp` (the response variable
# used by limo_slope) and make `year` numeric so that it enters the
# regression as a number.
df <- res %>%
  filter(key == "mean") %>%
  select(temp = value, everything()) %>%
  mutate(year = as.numeric(year))

# Second Xtractor: one regression slope per country.
xtractor2 <- Xtractor$new("xtractor2")
xtractor2$n_cores <- 2
xtractor2$add_data(df, group_by = "Country")
xtractor2$add_feature(limo_slope)
xtractor2$calc_features()
res2 <- xtractor2$results

Results

Country Slope
Afghanistan 0.0145613
Albania 0.0080569
Algeria 0.0125315
American Samoa 0.0095860
Andorra 0.0117471
Angola 0.0089852
Anguilla 0.0107910
Antigua And Barbuda 0.0109234
Argentina 0.0083684
Armenia 0.0143635
Aruba 0.0098219
Australia 0.0087285
Austria 0.0120316
Azerbaijan 0.0156783
Bahamas 0.0091516
Bahrain 0.0137320
Bangladesh 0.0071229
Barbados 0.0111324
Belarus 0.0138405
Belgium 0.0105878



Visualization

For a better overview, we can then plot our results on a map.